import codecs
import json
import os
import sys
from collections import Counter

# Re-register the error-handler name "strict" so that it maps to
# codecs.replace_errors. This is a process-wide side effect performed at
# import time.
# NOTE(review): presumably intended to keep undecodable bytes in the input
# files from aborting the run -- confirm the intent and that the override is
# actually honoured by the codecs in use.
codecs.register_error("strict", codecs.replace_errors)


# import jieba


class MySentences(object):
    """Iterable over JSON records stored one per line in a directory.

    Every regular entry directly inside ``dir_name`` is opened as a text
    file and each of its lines is parsed with ``json.loads``. The iterable
    can be traversed repeatedly; each traversal re-reads the directory.

    Args:
        dir_name: Path of the directory whose files are read.
    """

    def __init__(self, dir_name):
        self.dir_name = dir_name

    def __iter__(self):
        """Yield one parsed value per input line.

        Lines that fail to parse are reported on stdout and yield an empty
        dict, so the number of yielded items always equals the number of
        lines read.
        """
        for f_name in os.listdir(self.dir_name):
            path = os.path.join(self.dir_name, f_name)
            if os.path.isdir(path):
                # open() on a directory raises and would abort the whole
                # iteration; only direct children that are files are read.
                continue
            # The context manager closes the handle once the file has been
            # consumed (or the generator is closed early).
            with open(path) as handle:
                for line in handle:
                    try:
                        record = json.loads(line)
                    except Exception as e:
                        # Deliberately best-effort: report the problem and
                        # keep going with an empty record.
                        print(str(e))
                        record = {}
                    yield record


# Directory read when no path is given on the command line.
DEFAULT_TEXT_DIR = '/data/data/category_lanmu/at'


def count_categories(records):
    """Tally first-level and combined second-level category values.

    Args:
        records: Iterable of mappings. Each is queried with ``.get`` for
            ``'article_type_1st'`` and ``'article_type_2nd'``; a missing
            key counts as the empty string.

    Returns:
        A ``(class1_counts, class2_counts)`` pair of ``Counter`` objects.
        ``class1_counts`` is keyed by the first-level value and
        ``class2_counts`` by the second-level value concatenated with the
        first-level value.
    """
    class1_counts = Counter()
    class2_counts = Counter()
    for record in records:
        class1 = record.get('article_type_1st', '')
        class2 = record.get('article_type_2nd', '')
        class1_counts[class1] += 1
        # Second-level names are suffixed with their first-level name so
        # equal names under different parents are counted separately.
        class2_counts[class2 + class1] += 1
    return class1_counts, class2_counts


def main(argv=None):
    """Print category tallies for every record found in a directory.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, when present, is the
            directory to read; otherwise ``DEFAULT_TEXT_DIR`` is used.
    """
    if argv is None:
        argv = sys.argv[1:]
    text_dir = argv[0] if argv else DEFAULT_TEXT_DIR

    class1_dict, class2_dict = count_categories(MySentences(text_dir))

    # Convert to plain dicts so the printed form stays ``{...}`` in
    # first-seen order rather than Counter's own repr.
    print(dict(class1_dict))

    print(dict(class2_dict))

    print(len(class1_dict))
    print(len(class2_dict))


if __name__ == '__main__':
    main()
